library(devtools)
library(rgdal)
library(GGally)
library(ggplot2)
library(plotly)
library(scales)
library(ggthemes)
library(RColorBrewer)
library(viridis)
library(grid)
library(gridExtra)
library(ggimage)
library(png)
library(gridGraphics)
library(dplyr)
library(tidyr)
#devtools::install_github('bart6114/artyfarty')
library('artyfarty')
library(tm)
library(wordcloud)

# Part 1 ----
# (was a bare `Part1` token, which errors at runtime with "object 'Part1' not found")

# Clutch-time team statistics table.
clutch = read.csv('C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/fetched.csv')

# Number of games played (GP) vs number of wins (W): one horizontal bar pair
# per team, teams ordered by games played so the flipped plot reads sorted.
df1 = clutch[,c('GP','W','team')]
df1 = gather(df1,type,count,-team)
temp = df1[df1$type=='GP',]
new_levels = as.character(temp[order(temp$count),]$team)
df1$team = factor(df1$team,levels=new_levels)
# The duplicated xlab()/ylab() calls were removed; only the last ones took
# effect anyway.
df1 %>% ggplot(aes(x=team, y=count, fill=type))+
  geom_bar(stat="identity",position="identity")+
  scale_fill_manual(name="type of games",values = pal("five38"))+
  coord_flip()+ggtitle("number of games played (GP) v.s number of wins (W)")+
  geom_hline(yintercept=0)+
  ylab("number of games")+
  xlab("team name")+
  scale_y_continuous(breaks = pretty(df1$count),labels = abs(pretty(df1$count)))+
  theme_scientific()

# Personal fouls (PF) vs turnovers (TOV): diverging bars with PF negated onto
# the left of a shared zero line, teams ordered by turnover count.

df1 = clutch[,c('PF','TOV','team')]
df1 = gather(df1,type,count,-team)
# Negate PF so the two measures diverge from zero.
df1$count <-  ifelse(df1$type =="PF",df1$count*(-1),df1$count)
# Fix the accidental double assignment (`temp = temp = ...`).
temp = df1[df1$type=='TOV',]
new_levels = as.character(temp[order(temp$count),]$team)
df1$team = factor(df1$team,levels=new_levels)
# Duplicated xlab()/ylab() calls removed; only the last ones took effect.
df1 %>% ggplot(aes(x=team, y=count, fill=type))+
  geom_bar(stat="identity",position="identity")+
  scale_fill_manual(values = pal("five38"))+
  coord_flip()+ggtitle("Personal fouls (PF) and turnovers (TOV)")+
  geom_hline(yintercept=0)+
  ylab("counts")+
  xlab("team name")+
  # pretty() breaks with abs() labels hide the sign of the negated PF side.
  scale_y_continuous(breaks = pretty(df1$count),labels = abs(pretty(df1$count)))+
  theme_scientific()

# Divergent bars: share of points from 2PT (negated, left side) vs 3PT and FT
# (right side), teams ordered by their 2PT share.
df1 = clutch[,c('PCT_PTS_2PT','PCT_PTS_3PT','PCT_PTS_FT','team')]
df1 = gather(df1,type,count,-team)
temp = df1[df1$type=='PCT_PTS_2PT',]
new_levels = as.character(temp[order(temp$count),]$team)
df1$team = factor(df1$team,levels=new_levels)
# Negate the 2PT share so it extends left of the zero line.
df1$count <-  ifelse(df1$type =="PCT_PTS_2PT",df1$count*(-1),df1$count)

# Duplicated xlab()/ylab() calls removed; only the last ones took effect.
df1 %>% ggplot(aes(x=team, y=count, fill=type))+
  geom_col()+
  scale_fill_manual(values = pal("five38"))+
  coord_flip()+ggtitle("2PT%,3PT%,FT%")+
  geom_hline(yintercept=0)+
  ylab("percentage")+
  xlab("team name")+
  scale_y_continuous(breaks = pretty(df1$count),labels = abs(pretty(df1$count)))+
  theme_scientific()

# Scatter of each team's offensive vs defensive rating, drawing the team logo
# (fetched from GitHub by URL) at each point.
path = 'https://github.com/NiHaozheng/NBA-Visualization/blob/master/clutch_team/logo/'
df1 = clutch[,c('OFF_RATING','DEF_RATING','team')]
# paste0() is the idiomatic form of paste(..., sep = '').
df1$img = paste0(path,df1$team,'.png?raw=true')

# Defensive axis reversed: a lower defensive rating is better, so good teams
# land in the upper-right.
ggplot(df1,aes(x=OFF_RATING,y=DEF_RATING))+geom_point()+
  scale_y_reverse()+geom_image(image = df1$img, size = .05)+
  theme_scientific()+
  ggtitle("offensive rating v.s. defensive rating")+
  xlab('offensive rating')+ylab('defensive rating')

### Part 2

## Preprocess data to merge with the team
# Player-id -> team-name lookup; PERSON_ID is renamed to player_id so it can
# serve as the merge key used by my_read() below.
df_name_team = read.csv(file="C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/Name_Team.csv")
df_name_team = df_name_team[,c("PERSON_ID","Team_Name")]
colnames(df_name_team)[1] = "player_id"


# Read a per-player statistics CSV and attach each player's team name.
#
# path: path to a CSV file containing a `player_id` column.
# team: data frame with `player_id` and `Team_Name` columns; defaults to the
#       lookup table built above.
#
# Returns the full outer merge of the CSV with `team`, minus the artifact
# index column "X" (written by the upstream exporter). `drop = FALSE` guards
# against the result collapsing to a vector if only one column remained.
my_read = function(path, team = df_name_team) {
  temp = read.csv(file = path)
  final = merge(temp, team, by = "player_id", all = TRUE)
  final[, colnames(final) != "X", drop = FALSE]
}


# Per-player clutch shooting tables, each joined to the team lookup:
# 3-point percentage / makes, overall percentage / makes, points,
# free-throw attempts / percentage / makes.
df_3pct = my_read(path = "C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/3pct_df.csv")
df_3fgm = my_read(path = "C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/3fgm_df.csv")

# After each merge, ".x" columns come from the first table, ".y" from the second.
df_3 = merge(df_3fgm,df_3pct,by = "player_id",all=TRUE)

df_pct = my_read(path = "C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/pct_df.csv")
df_fgm = my_read(path = "C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/fgm_df.csv")

df_all = merge(df_fgm,df_pct,by = "player_id",all=TRUE)

df_pts = my_read(path = "C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/pts_df.csv")

df_fta = my_read(path = "C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/fta_df.csv")

df_fct = my_read(path = "C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/fct_df.csv")

df_ftm = my_read(path = "C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/ftm_df.csv")
# Define FGA: Field Goal Attempt, reconstructed as makes / percentage.
# NOTE(review): this element-wise division assumes df_fgm and df_fct rows are
# aligned player-by-player (same my_read/merge ordering) — TODO confirm.
FGA = df_fgm$overall / df_fct$overall
# Define TSP: True shooting percent = PTS / (2 * (FGA + 0.44 * FTA)).
# NOTE(review): same row-alignment assumption applies to df_pts and df_fta.
TSP = df_pts$overall/(2*(FGA+0.44*df_fta$overall))
df_pts['TSP'] = TSP
# Make a copy of df_pts
df_pts_v1 = df_pts
# Subset to remove all the NAs due to players that did not have a team or did not play in 2016
df_pts_v1_2 = df_pts_v1[!is.na(df_pts_v1$TSP),]

# True shooting percentage against overall points, rendered interactively:
# first faceted by team, then pooled with team encoded as point shape.
p_TSP = ggplot(df_pts_v1_2) +
  geom_point(aes(x = overall, y = TSP, color = player_name), size = 1) +
  labs(title = "TSP V.S PTS ", x = 'Overall PTS', y = 'Overall TSP') +
  facet_wrap(~Team_Name)
ggplotly(p_TSP)

p_TSP_All = ggplot(df_pts_v1_2) +
  geom_point(aes(x = overall, y = TSP,
                 color = player_name, shape = Team_Name),
             size = 2) +
  labs(title = "TSP V.S PTS ", x = 'Overall PTS', y = 'Overall TSP')
ggplotly(p_TSP_All)
# Overall FG% against overall FG makes per player: faceted view, then pooled.
df_pct['df_fgm_overall'] = df_fgm$overall
df_pct_v1 = df_pct
# Keep rows whose counterpart in df_fgm has a player name
# (assumes df_pct and df_fgm rows are aligned — TODO confirm).
df_pct_v1_2 = df_pct_v1[!is.na(df_fgm$player_name),]

p_FGMPCT = ggplot(df_pct_v1_2) +
  geom_point(aes(x = df_fgm_overall, y = overall, color = player_name),
             size = 1) +
  labs(title = "pct_overall V.S fgm_overall ",
       x = 'fgm_overall', y = 'pct_overall') +
  facet_wrap(~Team_Name)
ggplotly(p_FGMPCT)

p_FGMPCT_All = ggplot(df_pct_v1_2) +
  geom_point(aes(x = df_fgm_overall, y = overall,
                 color = player_name, shape = Team_Name),
             size = 2) +
  labs(title = "pct_overall V.S fgm_overall ",
       x = 'fgm_overall', y = 'pct_overall')
ggplotly(p_FGMPCT_All)
# Overall 3PT% against overall 3PT makes per player: faceted view, then pooled.
df_3pct['df_3fgm_overall'] = df_3fgm$overall
df_pct3_v1 = df_3pct
# Keep rows whose counterpart in df_3fgm has a player name
# (assumes df_3pct and df_3fgm rows are aligned — TODO confirm).
df_pct3_v1_2 = df_pct3_v1[!is.na(df_3fgm$player_name),]

p_3FGM3PCT = ggplot(df_pct3_v1_2) +
  geom_point(aes(x = df_3fgm_overall, y = overall, color = player_name),
             size = 1) +
  labs(title = "3pct_overall V.S 3fgm_overall ",
       x = '3fgm_overall', y = '3pct_overall') +
  facet_wrap(~Team_Name)
ggplotly(p_3FGM3PCT)

p_3FGM3PCT_All = ggplot(df_pct3_v1_2) +
  geom_point(aes(x = df_3fgm_overall, y = overall,
                 color = player_name, shape = Team_Name),
             size = 2) +
  labs(title = "3pct_overall V.S 3fgm_overall ",
       x = '3fgm_overall', y = '3pct_overall')
ggplotly(p_3FGM3PCT_All)
# Free throws attempted vs made in the last 30 seconds within 5 points
# ("30sec_plusminus_5"). The original misspelling "plusmiuns" in the derived
# column name and plot labels is corrected throughout this section.
df_fta['df_ftm_30sec_plusminus_5'] = df_ftm$X30sec_plusminus_5
df_fta_v1 =  df_fta
# Drop players with no name (unmatched merge rows); assumes df_fta row order.
df_fta_v1_2 = df_fta_v1[!is.na(df_fta$player_name),]

# Faceted by team.
p_fta_ftm = ggplot(df_fta_v1_2)+
  geom_point(aes(X30sec_plusminus_5,df_ftm_30sec_plusminus_5,color = player_name),size = 1)+
  facet_wrap(~Team_Name)+
  labs(title = "df_ftm_30sec_plusminus_5 V.S X30sec_plusminus_5 ",x = 'X30sec_plusminus_5', y='df_ftm_30sec_plusminus_5')
ggplotly(p_fta_ftm)
# Pooled, team encoded as shape.
p_fta_ftm = ggplot(df_fta_v1_2)+
  geom_point(aes(X30sec_plusminus_5,
                 df_ftm_30sec_plusminus_5,
                 color = player_name,
                 shape=Team_Name),
             size = 1.3,
             alpha=0.5)+
  labs(title = "df_ftm_30sec_plusminus_5 V.S X30sec_plusminus_5 ",x = 'X30sec_plusminus_5', y='df_ftm_30sec_plusminus_5')
ggplotly(p_fta_ftm)
# Pooled with jitter to separate heavily overplotted integer counts.
p_fta_ftm = ggplot(df_fta_v1_2)+
  geom_point(aes(X30sec_plusminus_5,
                 df_ftm_30sec_plusminus_5,
                 color = player_name,
                 shape=Team_Name),
             size = 1.3,
             alpha=0.5,
             position = "jitter")+
  labs(title = "df_ftm_30sec_plusminus_5 V.S X30sec_plusminus_5 ",x = 'X30sec_plusminus_5', y='df_ftm_30sec_plusminus_5')
ggplotly(p_fta_ftm)
# average within group 3point


# Colorblind-friendly palette.
# NOTE(review): cbP is defined but never used below — the parcoord plots use
# scale_colour_colorblind() instead; candidate for removal.
cbP = c("#999999", "#E69F00", "#56B4E9", "#009E73",
        "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Per-team sums of the clutch 3FGM situation columns (columns 3:12).
df_3fgm_sum = aggregate(df_3fgm[,3:12], list(df_3fgm$Team_Name), sum, na.rm = TRUE)
# Reconstruct per-player attempts as makes / percentage ("deno" = denominator).
# NOTE(review): element-wise data-frame division assumes both frames have
# identically ordered rows and that columns 1:13 line up — TODO confirm.
deno = df_3fgm/df_3pct[,1:13]
deno$player_name = df_3fgm$player_name
deno$player_id = df_3fgm$player_id
deno$Team_Name = df_3fgm$Team_Name
deno_modi = aggregate(deno[,3:12], list(deno$Team_Name), sum, na.rm = TRUE)
# Team average 3PT% per situation = total makes / total attempts.
average3point = df_3fgm_sum/deno_modi
average3point$Group.1=deno_modi$Group.1
# 0/0 situations produce NaN; zero them so ggparcoord can draw the lines.
average3point[is.na(average3point)] = 0


# Top-4 and bottom-4 teams, compared in the parallel-coordinate views below.
TopLowTeam = c("Celtics","Cavaliers","Warriors","Spurs",
               "Lakers","Suns","76ers","Nets")
TopLow3point = average3point[average3point$Group.1 %in% TopLowTeam,]



# Parallel coordinates of average 3PT% for the top-4 vs bottom-4 teams:
# p1 covers the "last X min, down by y" columns, p2 the plus/minus columns.
p1 = ggparcoord(TopLow3point, columns = 2:7,
                groupColumn = 'Group.1', scale = 'globalminmax') +
  geom_vline(xintercept = 0:6, color = "lightblue") +
  labs(title = "Average 3PT Last Xmin yDown Top4 V.S Low4",
       x = 'Indicator', y = 'Team Average') +
  theme(axis.text.x = element_text(angle = 90)) +
  scale_colour_colorblind()

p2 = ggparcoord(TopLow3point, columns = c(2, 8:10),
                groupColumn = 'Group.1', scale = 'globalminmax') +
  geom_vline(xintercept = 0:5, color = "lightblue") +
  labs(title = "Average 3PT Last Xmin yDownorHiger Top4 V.S Low4",
       x = 'Indicator', y = 'Team Average') +
  theme(axis.text.x = element_text(angle = 90)) +
  scale_colour_colorblind()
# average within group all point


# Colorblind-friendly palette.
# NOTE(review): cbP is re-defined here and again unused — the plots use
# scale_colour_colorblind(); candidate for removal.
cbP = c("#999999", "#E69F00", "#56B4E9", "#009E73",
        "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Same pipeline as the 3-point section, but for all field goals:
# per-team sums of makes, reconstructed attempts, then makes/attempts.
df_fgm_sum = aggregate(df_fgm[,3:12], list(df_fgm$Team_Name), sum, na.rm = TRUE)
# NOTE(review): element-wise data-frame division assumes df_fgm and df_pct
# rows are identically ordered and columns 1:13 line up — TODO confirm.
deno = df_fgm/df_pct[,1:13]
deno$player_name = df_fgm$player_name
deno$player_id = df_fgm$player_id
deno$Team_Name = df_fgm$Team_Name
deno_modi = aggregate(deno[,3:12], list(deno$Team_Name), sum, na.rm = TRUE)
averagepoint = df_fgm_sum/deno_modi
averagepoint$Group.1=deno_modi$Group.1
# 0/0 situations produce NaN; zero them for plotting.
averagepoint[is.na(averagepoint)] = 0

TopLowTeam = c("Celtics","Cavaliers","Warriors","Spurs",
               "Lakers","Suns","76ers","Nets")
TopLowpoint = averagepoint[averagepoint$Group.1 %in% TopLowTeam,]
#averagepoint



# Same two parallel-coordinate views, for total field-goal averages.
p3 = ggparcoord(TopLowpoint, columns = 2:6,
                groupColumn = 'Group.1', scale = 'globalminmax') +
  geom_vline(xintercept = 0:5, color = "lightblue") +
  labs(title = "Average TotalPT Last Xmin yDown Top4 V.S Low4",
       x = 'Indicator', y = 'Team Average') +
  theme(axis.text.x = element_text(angle = 90)) +
  scale_colour_colorblind()

p4 = ggparcoord(TopLowpoint, columns = c(2, 7:10),
                groupColumn = 'Group.1', scale = 'globalminmax') +
  geom_vline(xintercept = 0:5, color = "lightblue") +
  labs(title = "Average TotalPT Last Xmin yDownorHiger Top4 V.S Low4",
       x = 'Indicator', y = 'Team Average') +
  theme(axis.text.x = element_text(angle = 90)) +
  scale_colour_colorblind()


# 2x2 panel: 3PT views (p1, p2) above total-point views (p3, p4).
grid.arrange(p1, p2, p3, p4, nrow = 2)

# Team-logo scatters of the clutch "10 seconds left, down 3" average against
# the overall average, for total field goals and for 3-pointers.
path = 'https://github.com/NiHaozheng/NBA-Visualization/blob/master/clutch_team/logo/'
averagepoint$img = paste0(path,averagepoint$Group.1,'.png?raw=true')
# Row 1 is the group of players without a team; drop it
# (NOTE(review): 2:31 hard-codes 31 groups — verify against the data).
rmv_0_averagepoint = averagepoint[2:31,]

average3point$img = paste0(path,average3point$Group.1,'.png?raw=true')
rmv_0_average3point = average3point[2:31,]

# BUG FIX: the logo URLs must come from the data frame being plotted, not
# from the unrelated `df1` left over from Part 1. The two titles also
# appeared swapped (averagepoint derives from all FGM, average3point from
# 3FGM) and are corrected here.
p1 = ggplot(rmv_0_averagepoint,aes(overall,X10sec_down_3))+
  geom_point()+
  geom_image(image = rmv_0_averagepoint$img, size = .05)+
  theme_scientific()+
  labs(title = "Total Average 10sec_down_3 v.s. Overall",x = 'Overall', y='X10sec_down_3')

p2 = ggplot(rmv_0_average3point,aes(overall,X10sec_down_3))+
  geom_point()+
  geom_image(image = rmv_0_average3point$img, size = .05)+
  theme_scientific()+
  labs(title = "3pt Average X10sec_down_3 v.s. Overall",x = 'Overall', y='X10sec_down_3')
grid.arrange(p1, p2, nrow = 1)

# Part 3 ----
# (was a bare `Part3` token, which errors at runtime with "object 'Part3' not found")

# Overall FG% against FG% in the last minute when down by up to 5, per player,
# faceted by team; small jitter separates overlapping points.
ggplot()+
  geom_point(data =df_pct,
             aes(x = X1min_down_5, y= overall),
             position = position_jitter(w = 0.01, h = 0.02),
             alpha = 0.5,
             size = 3)+
  facet_wrap(~Team_Name)+
  labs(title = "overall V.S X1min_down_5",
       x = 'X1min_down_5', 
       y='overall')

# Percentage (.y, from df_pct) against makes (.x, from df_fgm) for the
# 5-minute plus/minus situation, faceted by team, with jitter.
pp = ggplot() +
  geom_point(data = df_all,
             aes(x = X5min_plusminus_5.x, y = X5min_plusminus_5.y,
                 color = player_name.x),
             alpha = 0.5,
             size = 2,
             position = position_jitter(w = 0.01, h = 0.02)) +
  facet_wrap(~Team_Name.x) +
  labs(title = "5min_plusminus_5_percent V.S X5min_plusminus_5_actual",
       x = 'X5min_plusminus_5_actual',
       y = '5min_plusminus_5_percent')
ggplotly(pp)
# Scatterplot matrices pairing "made" (.x, from df_fgm) with "percent"
# (.y, from df_pct) for matching clutch situations.
pairs(df_all[c("X10sec_down_3.x","X10sec_down_3.y","X30sec_down_3.x","X30sec_down_3.y")])

#df_all
# NOTE(review): the stray dots in "X3min._down_5" / "X5min._down_5" mirror
# the source CSV headers — do not "fix" them without checking those files.
pairs(df_all[c("X1min_down_5.x","X1min_down_5.y",
               "X3min._down_5.x","X3min._down_5.y",
               "X5min._down_5.x","X5min._down_5.y")])

#df_all
pairs(df_all[c("X30sec_plusminus_5.x","X30sec_plusminus_5.y",
               "X1min_plusminus_5.x","X1min_plusminus_5.y",
               "X3min_plusminus_5.x","X3min_plusminus_5.y")])

# Average overall field goals made per team, as a sorted horizontal bar chart.
df_all$Team_Name.x = as.factor(df_all$Team_Name.x)
countorder = df_all %>%
  group_by(Team_Name.x) %>%
  summarize(av = mean(overall.x, na.rm = TRUE))

ggplot(countorder, aes(x = reorder(Team_Name.x, av), y = av)) +
  geom_col(color = "tomato", fill = "orange", alpha = .2) +
  coord_flip() +
  theme_scientific() +
  labs(title = "Team Average Overall fgm", x = 'Team', y = 'Average Overall fgm')

### Part4

#TopLowTeam = c("Celtics","Cavaliers","Warriors","Spurs","Lakers","Suns","76ers","Nets")

# Raw tweet dumps: one file for the whole corpus plus one per selected team.
tweet_content = readr::read_file("C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/Twitter/tweet_content.txt")
Spurs_tweet_content = readr::read_file("C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/Twitter/By Team/Spurs.txt")
Warriors_tweet_content = readr::read_file("C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/Twitter/By Team/Warriors.txt")
Lakers_tweet_content = readr::read_file("C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/Twitter/By Team/Lakers.txt")
T76ers_tweet_content = readr::read_file("C:/Users/hl3083/Downloads/EDA-20180420T211926Z-001/EDA/NBA_FINAL/NBA_data/Twitter/By Team/76ers.txt")
# Build and draw a word cloud from raw tweet text.
#
# tweet_content: a single character string holding all tweets.
# min_freq: minimum term frequency for a word to appear in the cloud.
# label: name used in the diagnostic printouts; the default preserves the
#        original output for the whole-corpus call.
#
# Prints the six most frequent terms and their counts, then draws the cloud.
#
# Fix: base tolower must be wrapped in content_transformer() — passing it
# bare to tm_map() strips the document class, which corrupts downstream
# steps (the PlainTextDocument repair step is then no longer needed).
My_word_cloud = function(tweet_content, min_freq, label = "Whole twitter"){
  docs = Corpus(VectorSource(tweet_content)) %>%
    tm_map(removePunctuation) %>%
    tm_map(removeNumbers) %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeWords, stopwords("english")) %>%
    tm_map(stripWhitespace)

  tdm = TermDocumentMatrix(docs) %>%
    as.matrix()

  # Term frequencies of the (single) document, sorted descending.
  content = as.matrix(tdm[,1])
  content = as.matrix(content[order(content, decreasing=TRUE),])

  print(paste0("head(", label, ")"))
  print(head(content))
  print(paste0(label, "'s most occurring words:"))
  print(head(rownames(content)))

  # Drop the three lightest shades of the sequential palette for readability.
  pal <- brewer.pal(9, "YlGnBu")
  pal <- pal[-(1:3)]

  wordcloud(rownames(content), content, min.freq = min_freq, scale=c(5, .2),
            random.order = FALSE, random.color = FALSE, colors= pal)
}
## Let's look at what is going on if we plot the twitter!
# Word cloud over the full corpus; only words occurring at least 100 times.
My_word_cloud(tweet_content = tweet_content,min_freq=100)
# Captured output from the original run. NOTE(review): the garbage token
# "ontnt" suggests the tm pipeline was corrupting terms — verify after rerun.
## [1] "head(Whole twitter)"
##          [,1]
## ontnt    1539
## warriors 1367
## pts      1337
## nba      1238
## spurs    1160
## player   1072
## [1] "Whole twitter's most occuring words:"
## [1] "ontnt"    "warriors" "pts"      "nba"      "spurs"    "player"

## Let's look at what is going on WITH TEAM



# Split the Spurs dump into alternating header/content lines: odd lines are
# timestamps ([[1]]), even lines are tweet text ([[2]]).
splited_Spurs = strsplit(Spurs_tweet_content, "\n")
splited_Spurs_2 = split(splited_Spurs[[1]], 1:2)

# Extract the 4th space-separated token of each header line.
# vapply preallocates the result instead of growing it with c() in a loop,
# and is safe when the input is empty (the old 1:length() loop was not).
tweet_time = vapply(splited_Spurs_2[[1]],
                    function(line) strsplit(line, " ")[[1]][4],
                    character(1), USE.NAMES = FALSE)
Spurs_df = data.frame("Date"=tweet_time,"content"=splited_Spurs_2[[2]])

# Word cloud over the concatenated Spurs tweets.
sp = paste(splited_Spurs_2[[2]], collapse=' ')
My_word_cloud(tweet_content = sp, min_freq = 21)
## [1] "head(Whole twitter)"
##          [,1]
## spurs    2476
## man       414
## utd       323
## beat      286
## warriors  229
## game      227
## [1] "Whole twitter's most occuring words:"
## [1] "spurs"    "man"      "utd"      "beat"     "warriors" "game"

# Same pipeline for the Warriors dump. BUG FIX: this section previously
# reused the Spurs-named variables and silently overwrote Spurs_df.
splited_Warriors = strsplit(Warriors_tweet_content, "\n")
splited_Warriors_2 = split(splited_Warriors[[1]], 1:2)

# 4th token of each header line; vapply preallocates (no c()-growth loop).
tweet_time = vapply(splited_Warriors_2[[1]],
                    function(line) strsplit(line, " ")[[1]][4],
                    character(1), USE.NAMES = FALSE)
Warriors_df = data.frame("Date"=tweet_time,"content"=splited_Warriors_2[[2]])

sp = paste(splited_Warriors_2[[2]], collapse=' ')
My_word_cloud(tweet_content = sp, min_freq = 21)
## [1] "head(Whole twitter)"
##          [,1]
## warriors 2248
## curry     256
## game      255
## spurs     232
## will      215
## stephen   187
## [1] "Whole twitter's most occuring words:"
## [1] "warriors" "curry"    "game"     "spurs"    "will"     "stephen"

# Same pipeline for the Lakers dump. BUG FIX: this section previously
# reused the Spurs-named variables and silently overwrote Spurs_df.
splited_Lakers = strsplit(Lakers_tweet_content, "\n")
splited_Lakers_2 = split(splited_Lakers[[1]], 1:2)

# 4th token of each header line; vapply preallocates (no c()-growth loop).
tweet_time = vapply(splited_Lakers_2[[1]],
                    function(line) strsplit(line, " ")[[1]][4],
                    character(1), USE.NAMES = FALSE)
Lakers_df = data.frame("Date"=tweet_time,"content"=splited_Lakers_2[[2]])

sp = paste(splited_Lakers_2[[2]], collapse=' ')
My_word_cloud(tweet_content = sp, min_freq = 21)
## [1] "head(Whole twitter)"
##         [,1]
## lakers   908
## kawhi    224
## leonard  131
## lebron    95
## trade     92
## los       91
## [1] "Whole twitter's most occuring words:"
## [1] "lakers"  "kawhi"   "leonard" "lebron"  "trade"   "los"

# Same pipeline for the 76ers dump (lower min_freq: smaller corpus).
# BUG FIX: this section previously reused the Spurs-named variables and
# silently overwrote Spurs_df.
splited_76ers = strsplit(T76ers_tweet_content, "\n")
splited_76ers_2 = split(splited_76ers[[1]], 1:2)

# 4th token of each header line; vapply preallocates (no c()-growth loop).
tweet_time = vapply(splited_76ers_2[[1]],
                    function(line) strsplit(line, " ")[[1]][4],
                    character(1), USE.NAMES = FALSE)
T76ers_df = data.frame("Date"=tweet_time,"content"=splited_76ers_2[[2]])

sp = paste(splited_76ers_2[[2]], collapse=' ')
My_word_cloud(tweet_content = sp, min_freq = 10)
## [1] "head(Whole twitter)"
##              [,1]
## ers           532
## heat          117
## game           98
## nba            87
## embiid         85
## philadelphia   82
## [1] "Whole twitter's most occuring words:"
## [1] "ers"          "heat"         "game"         "nba"         
## [5] "embiid"       "philadelphia"